## Loading required package: magrittr
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.8
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
No differential analysis was performed, but this report contains expression information and quality control for the samples listed below.
Data is processed from gene expression estimates derived from salmonCounts.
Gene expression estimates are normalized and compared between conditions using DEseq2. For more information on the analysis you can review the code in this document and visit our training material at https://rockefelleruniversity.github.io
The code required to recreate the analysis and figures of this report can be seen by clicking on the dropdown headers with the arrows next to the text.
The input files from salmonCounts are shown in the following table alongside the sample names and any group information. This table can also be found in the NA/_sampleTable.csv csv file.
## INFO [2023-11-14 13:33:31] Loading counts...
Numerous input parameters that were entered into the app are available and used to build this report. Click on the dropdown below to see the required parameters to run the code througout this report.
Setting up the working directory
Set the required parameters
The analysis performed in the document uses several parameters set during the analysis in ShinyNGSpipeR.
Maybe we could add description of parameters
To set these parameters for your own analysis please copy and run the below section of code into your R session.
```{r} alphaMin <- 0L user_output_name <- “NA” render_output_dir <- “NA” project_outdir_name <- “NA” project_output_path <- “NA” report_output_dirpath <- “NA” report_output_dirname <- “NA” IDtoSymbol_report_path <- “NA” transcriptToGene_report_path <- “NA” species <- “NA” TestCondition <- “NA” baseLineCondition <- “NA” padjCutOff <- “NA” logFC_Down <- “NA” logFC_Up <- “NA” column_for_analysis <- “NA” pca_color_group_select <- “NA” pca_ntop <- “NA” select_sample_heatmap <- “NA” heatmap_cluster_number <- “NA” cols_for_anno <- “NA” DE_clicked_genes <- “NA” DE_genes_of_interest <- NULL highlighted_genes_status <- “NA” goi_genes_status <- “NA” cluster_select_over_rep_input <- “NA” cluster_select_over_rep_list <- “NA” gsea_selected_terms <- “NA” over_rep_active_categories <- NULL gsea_active_categories <- NULL MA_all_DE_col <- “NA” MA_all_nonDE_col <- “NA” MA_goi_col <- “NA” MA_non_goi_col <- “NA” volcano_all_DE_col <- “NA” volcano_all_nonDE_col <- “NA” volcano_goi_col <- “NA” volcano_non_goi_col <- “NA” cluster_color_list <- “NA” ht_cluster_color_list <- “NA” top_anno_colors <- “NA” colors_for_pheatmap <- “NA” pca_group_colors <- “NA” dist_group_colors <- “NA”
**Set the required input file paths.**
The analysis will also require the user-supplied input data, sample/group metadata information and the genome build specific reference data files
All required input files are within the result directory and you can set the file paths for your own analysis by copying and running the below section of code into your R session.
```{r}
tx2geneFile <- 'NA'
annoSym <- 'NA'
sampleSheet <- 'NA_sampleTable.csv'
QCSheet <- 'NA_QCsummary.csv'
The Report directory contains output from a differential analysis including -
Load count data and prepare for DESeq2 differential expression analysis
metaData <- sampleSheet %>% select(-CountFiles)
# we need to load the data into a DESeq object differently is ther is not two groups, so we check that here
if(length(levels(as.factor(metaData$Group))) < 2){
atLeast_two_groups <- FALSE
}else{
atLeast_two_groups <- TRUE
}
# DESeq cant run if we don't have at least one group with more than one replicate, so here we check
number_of_reps <- table(metaData$Group)
if(var(number_of_reps) == 0 & number_of_reps[1] == 1){
atLeast_one_replicate <- FALSE
}else{
atLeast_one_replicate <- TRUE
}
countFiles <- sampleSheet %>% pull(CountFiles)
names(countFiles) <- rownames(sampleSheet)tx2gene <- read.delim(transcriptToGene_report_path,sep=",",header = TRUE)
txi <- tximport(files = countFiles, type="salmon", tx2gene=tx2gene)
if(atLeast_two_groups){
dds <- DESeqDataSetFromTximport(txi,colData = metaData,design = ~Group)
}else{
dds <- DESeqDataSetFromTximport(txi,colData = metaData,design = ~1)
}The Heatmap below shows an overview of the correlation between gene expression profiles for the samples under investigation. Samples with greater similarity in gene expression profiles will be clustered closer together in the heatmap and have smaller values for between samples distances (indicated by darker blue).
Create the sample-sample heatmap
sampleDists <- dist(t(assay(normExprs)))
sampleDistMatrix <- as.matrix(sampleDists)
colData_anno <- as.data.frame(colData(normExprs))
colnames(colData_anno)[colnames(colData_anno) == "Group"] <- column_for_analysis
rownames(sampleDistMatrix) <- colnames(normExprs)
colnames(sampleDistMatrix) <- colnames(normExprs)
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
color_pal1 <- pals::alphabet()
color_pal2 <- pals::alphabet()[c(14:26, 1:13)]
names(color_pal2) <- NULL
color_pal_ht <- rep(c("blue", "#EEEEEE", "red"), 10)
dist_group_colors <- list()
dist_group_colors[[column_for_analysis]] <- color_pal2[1:nlevels(colData_anno[[column_for_analysis]])]
names(dist_group_colors[[column_for_analysis]]) <- levels(colData_anno[[column_for_analysis]])
# samp_ht <- pheatmap::pheatmap(sampleDistMatrix,
# clustering_distance_rows=sampleDists,
# clustering_distance_cols=sampleDists,
# annotation_col=as.data.frame(colData(normExprs)) %>% dplyr::select(Group),
# col=colors,
# angle_col = 45)
#
# samp_ht
samp_ht <- ComplexHeatmap::pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors,
angle_col = "45",
main = "Distance matrix for expression of all genes",
heatmap_legend_param = list(title = "Distance\n(Eucl.)",
title_gp = gpar(fontsize = 10, fontface = "bold")),
top_annotation = HeatmapAnnotation(df = colData_anno, col = dist_group_colors))The Principal Component Analysis below displays the separation of samples along the major sources of variation in the data.
pcaData <- plotPCA(normExprs, intgroup=c("Group"), ntop = pca_ntop, returnData=TRUE)
colnames(pcaData)[colnames(pcaData) == "Group"] <- column_for_analysis
percentVar <- round(100 * attr(pcaData, "percentVar"))load("forPCA.RData")
qc_summary_table <- key_stats
pcaData_merge <- merge(pcaData, qc_summary_table, by.x = 0, by.y = 1) # here we use the qc summary tabel from the app, but we do make it below from scratch, just need this here, so this is easiesit for now. We could make this table here if we want and use below
pcaData_merge$color_col <- pcaData_merge[[pca_color_group_select]]
if(pca_color_group_select %in% c("TOTAL_READS", "TOTAL_READS_PAIR")){
pcaData_merge$color_col <- gsub(",", "", pcaData_merge$color_col)
pcaData_merge$color_col <- as.numeric(pcaData_merge$color_col)
}
if (identical(pcaData_merge[[column_for_analysis]], pcaData_merge$color_col)){
shape = NULL
}else{
shape = column_for_analysis
}
p = ggplot(pcaData_merge, aes_string("PC1", "PC2", shape=shape, color = "color_col", label = "name")) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed() +
ggtitle("PCA plot of gene expression") +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
theme_bw() +
scale_shape_manual(values=1:nlevels(pcaData_merge[[column_for_analysis]]))
# if(is(pcaData_merge$color_col, "factor") | is(pcaData_merge$color_col, "character")){
# p = p + guides(color = guide_legend(title = pca_color_group_select,
# title.theme = element_text(size=8)),
# shape = guide_legend(title = NULL)) +
# scale_colour_manual(values = pca_group_colors)
#
# }
# if(is(pcaData_merge$color_col, "numeric")){
# p = p + guides(color = guide_colourbar(title = pca_color_group_select,
# title.theme = element_text(size=8)),
# shape = guide_legend(title = NULL)
# )
# }
##########
# p <- plotPCA(normExprs,
# ntop=500,
# intgroup="Group") +
# theme_bw() +
# ggtitle(paste0("PCA for top500 most variable genes in ",params$TestCondition," and ",params$baseLineCondition," samples"))No differential expression analysis was performed, this can be done by clicking the ‘Run differential’ button within the ‘Differential Gene Expression’ tab.
No genes of interest were entered into the app. To do so, enter a list of genes in the appropriate text box of the ‘Differential Gene Expression Set Up’ section of the ‘Differential Gene Expression’ tab of the app
## INFO [2023-11-14 13:34:10] Rendering quality control...
Input files required by the pipeline (links contain information about each file format):
Sample data
Reference data
| Input_Files | Name |
|---|---|
| Genome | hg38 |
| Reference Fasta | dr11_BzippedWhite.fa.bgz |
| GTF | dr11_gBzipped.gtf.bgz |
| ID to Symbol map | DR11.geneToSymbol.txt |
Quality control files summarized in this report (links navigate to appropriate section of the report):
Processed files generated by the pipeline for each sample (links contain information about each file format):
The information from this report is derived from the reports generated for each sample. These reports have more detailed information on each sample, and the single sample reports can be accessed by clicking the links below.
Read quality is assessed using the BRC’s Rfastp R package which provides an R interface for the fastp and genocore C libraries.
In contrast to quality control through FastQC, Rfastp will calculate statistics across the complete FQ file instead of a subsample of reads. Additionally Rfastp includes assessment for read pairs instead of disjointed QC for each file of a pair.
Rfastp captures metrics on:
Fastp assesses each read based on a number of quality metrics, and determines whether that read meets a quality filtering threshold. Reads did not pass the quality filter if at least one of the following are true:
For each sequencing cycle (separated by read 1 or read 2 for paired-end), which corresponds to each position in the read, the proportion of G and C combined across all reads is shown.
For each sequencing cycle (separated by read 1 or read 2 for paired-end), which corresponds to each position in the read, the average phred quality score is shown. The quality score is equal to -10*log10(P), with P being the probability that a base is incorrect. So if the quality score is 30, this means there is a 0.1% chance (P = 0.001) that the base is incorrect.
The 25 most common Kmers (of length 5) across all sampels are shown in the heatmap below. The mean number of times that kmer is present in the reads for that sample is shown in the bar plot at the top of the heatmap. The z-score for each kmer across samples is plotted in the heatmap.
Picard and Java are included into the pipeline through installation by the BRC’s Herper package. Secondarily installs external software requirements in a dedicated Conda environment versions alongside the pipeline version.
The pipeline generates two separate Picard quality control results, AlignmentSummaryMetrics and RnaSeqMetrics.
The rate and number of aligned reads are shown in the figure below. Hover over each bar to see the actual numeric values.
Various Picard of metrics aligned reads are shown that are relevant to RNA-seq.
The bases within each read are annotated by the type of genomic region they were aligned to, and the proportion of bases for each region are shown. Hover over each bar to see the actual numeric values.
Each transcript is normalized to the same scale (0-100) and the relative coverage across the length of all transcripts is shown. Hover over each line to see more information.
The proportion of reads that were assigned to genic regions for the RNA seq analysis are shown below. Hover over each bar to see the actual numeric values.
| Package | Citation |
|---|---|
| Picard Tools | https://broadinstitute.github.io/picard/ |
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] aws.s3_0.3.21 futile.logger_1.4.3
## [3] gridExtra_2.3 GO.db_3.14.0
## [5] magick_2.7.3 org.Mm.eg.db_3.14.0
## [7] org.Hs.eg.db_3.14.0 enrichplot_1.14.2
## [9] clusterProfiler_4.2.2 msigdbr_7.5.1
## [11] ComplexHeatmap_2.10.0 highcharter_0.9.4
## [13] GlossyBRC_0.1.0 jsonlite_1.8.0
## [15] rjson_0.2.21 rio_0.5.29
## [17] plotly_4.10.0 RColorBrewer_1.1-3
## [19] htmltools_0.5.2 knitr_1.38
## [21] rmarkdown_2.13 kableExtra_1.3.4
## [23] ggrepel_0.9.1 DT_0.22
## [25] fgsea_1.20.0 goseq_1.46.0
## [27] geneLenDataBase_1.30.0 BiasedUrn_1.07
## [29] GSEABase_1.56.0 graph_1.72.0
## [31] annotate_1.72.0 XML_3.99-0.8
## [33] AnnotationDbi_1.56.2 pheatmap_1.0.12
## [35] tximport_1.22.0 crosstalk_1.2.0
## [37] DESeq2_1.34.0 SummarizedExperiment_1.24.0
## [39] Biobase_2.54.0 MatrixGenerics_1.6.0
## [41] matrixStats_0.61.0 GenomicRanges_1.46.1
## [43] GenomeInfoDb_1.30.1 IRanges_2.28.0
## [45] S4Vectors_0.32.3 BiocGenerics_0.40.0
## [47] forcats_0.5.1 stringr_1.4.0
## [49] dplyr_1.0.8 purrr_0.3.4
## [51] readr_2.1.2 tidyr_1.2.0
## [53] tibble_3.1.7 ggplot2_3.3.5
## [55] tidyverse_1.3.1 magrittr_2.0.2
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 rtracklayer_1.54.0 ragg_1.2.5
## [4] bit64_4.0.5 DelayedArray_0.20.0 data.table_1.14.2
## [7] KEGGREST_1.34.0 RCurl_1.98-1.6 doParallel_1.0.17
## [10] generics_0.1.2 GenomicFeatures_1.46.5 lambda.r_1.2.4
## [13] RSQLite_2.2.10 shadowtext_0.1.2 bit_4.0.4
## [16] tzdb_0.2.0 rlist_0.4.6.2 webshot_0.5.2
## [19] xml2_1.3.3 lubridate_1.8.0 assertthat_0.2.1
## [22] viridis_0.6.2 xfun_0.39 hms_1.1.1
## [25] jquerylib_0.1.4 babelgene_22.3 evaluate_0.15
## [28] fansi_1.0.2 restfulr_0.0.13 progress_1.2.2
## [31] dbplyr_2.1.1 readxl_1.3.1 igraph_1.3.0
## [34] DBI_1.1.2 geneplotter_1.72.0 quantmod_0.4.20
## [37] htmlwidgets_1.5.4 ellipsis_0.3.2 backports_1.4.1
## [40] biomaRt_2.50.3 vctrs_0.4.1 TTR_0.24.3
## [43] cachem_1.0.6 withr_2.5.0 aws.signature_0.6.0
## [46] ggforce_0.3.3 vroom_1.5.7 GenomicAlignments_1.30.0
## [49] treeio_1.18.1 xts_0.12.2 prettyunits_1.1.1
## [52] svglite_2.1.0 cluster_2.1.2 DOSE_3.20.1
## [55] ape_5.6-2 lazyeval_0.2.2 crayon_1.5.0
## [58] genefilter_1.76.0 labeling_0.4.2 pkgconfig_2.0.3
## [61] tweenr_1.0.2 nlme_3.1-153 pals_1.7
## [64] rlang_1.0.6 lifecycle_1.0.1 downloader_0.4
## [67] filelock_1.0.2 BiocFileCache_2.2.1 modelr_0.1.8
## [70] dichromat_2.0-0.1 cellranger_1.1.0 polyclip_1.10-0
## [73] Matrix_1.3-4 aplot_0.1.6 zoo_1.8-9
## [76] base64enc_0.1-3 reprex_2.0.1 GlobalOptions_0.1.2
## [79] png_0.1-7 viridisLite_0.4.0 bitops_1.0-7
## [82] Biostrings_2.62.0 blob_1.2.2 shape_1.4.6
## [85] qvalue_2.26.0 gridGraphics_0.5-1 scales_1.1.1
## [88] memoise_2.0.1 plyr_1.8.7 zlibbioc_1.40.0
## [91] scatterpie_0.1.7 compiler_4.1.2 BiocIO_1.4.0
## [94] clue_0.3-61 Rsamtools_2.10.0 cli_3.6.0
## [97] XVector_0.34.0 patchwork_1.1.1 formatR_1.12
## [100] MASS_7.3-54 mgcv_1.8-38 tidyselect_1.1.2
## [103] stringi_1.7.6 textshaping_0.3.6 highr_0.9
## [106] yaml_2.3.5 GOSemSim_2.20.0 locfit_1.5-9.4
## [109] sass_0.4.1 fastmatch_1.1-3 tools_4.1.2
## [112] parallel_4.1.2 circlize_0.4.15 rstudioapi_0.13
## [115] foreach_1.5.2 foreign_0.8-81 farver_2.1.0
## [118] ggraph_2.0.5 digest_0.6.29 Rcpp_1.0.8
## [121] broom_0.7.12 httr_1.4.2 colorspace_2.0-3
## [124] rvest_1.0.2 fs_1.5.2 splines_4.1.2
## [127] yulab.utils_0.0.4 tidytree_0.3.9 graphlayouts_0.8.0
## [130] mapproj_1.2.9 ggplotify_0.1.0 systemfonts_1.0.4
## [133] xtable_1.8-4 futile.options_1.0.1 ggtree_3.2.1
## [136] tidygraph_1.2.0 ggfun_0.0.6 R6_2.5.1
## [139] pillar_1.7.0 glue_1.6.2 fastmap_1.1.0
## [142] BiocParallel_1.28.3 codetools_0.2-18 maps_3.4.1
## [145] utf8_1.2.2 lattice_0.20-45 bslib_0.3.1
## [148] curl_4.3.2 zip_2.2.0 openxlsx_4.2.5
## [151] survival_3.2-13 munsell_0.5.0 DO.db_2.9
## [154] GetoptLong_1.0.5 GenomeInfoDbData_1.2.7 iterators_1.0.14
## [157] haven_2.4.3 reshape2_1.4.4 gtable_0.3.0